import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import numpy as np

%matplotlib inline


# Load the raw (not yet encoded) data set from the Excel workbook.
df_unencoded = pd.read_excel("data.xlsx")

# Peek at the first three rows.
df_unencoded.head(3)


# Print column dtypes and non-null counts, then discard every row that has
# at least one missing value ('any' is dropna's default policy).
df_unencoded.info()
df_unencoded = df_unencoded.dropna()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1998 entries, 0 to 1997
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   size_m2            1998 non-null   int64 
 1   bathrooms          1998 non-null   int64 
 2   meters_from_metro  1998 non-null   int64 
 3   has_garage         1997 non-null   object
 4   property_type      1998 non-null   object
 5   price              1998 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 93.8+ KB


# Bathroom counts are discrete, so centre one histogram bar on each observed
# value: bin edges sit halfway between neighbouring values, with half a unit
# of padding below the smallest and above the largest value.
unique_values = np.unique(df_unencoded['bathrooms'])  # sorted ascending
# Midpoints are taken by slicing rather than np.convolve(..., 'valid'):
# when there is only one unique value, convolve swaps its operands and
# returns two bogus "midpoints", producing non-monotonic bin edges. Slicing
# correctly yields no interior edges in that case.
bin_edges = np.concatenate((
    [unique_values.min() - 0.5],
    (unique_values[:-1] + unique_values[1:]) / 2,
    [unique_values.max() + 0.5],
))
histogram = sns.histplot(data=df_unencoded, x="bathrooms", bins=bin_edges)


# size_m2 has many distinct values, so seaborn's default binning is used.
histogram = sns.histplot(data=df_unencoded, x="size_m2")


# Price against size_m2, with points coloured by property type.
sns.scatterplot(data=df_unencoded, x="size_m2", y="price", hue="property_type")

<AxesSubplot:xlabel='size_m2', ylabel='price'>


# Price against distance to the metro, with points coloured by property type.
sns.scatterplot(
    data=df_unencoded, x="meters_from_metro", y="price", hue="property_type"
)

<AxesSubplot:xlabel='meters_from_metro', ylabel='price'>


# Pairwise correlations between the numeric columns only. The object-typed
# columns (has_garage, property_type) are excluded explicitly because
# pandas >= 2.0 no longer drops them silently: DataFrame.corr() raises when
# it meets string data.
sns.heatmap(df_unencoded.select_dtypes(include='number').corr(),
            cmap='Reds',
            annot=True)
plt.title('Correlation Matrix');


def encode(df):
    """Return a model-ready copy of *df* with categorical features encoded.

    * ``has_garage``: ``'yes'`` becomes 1 and ``'no'`` becomes 0; any other
      value is left as it is.
    * ``property_type``: one-hot encoded into one column per distinct value,
      after which the original column and the ``'terraced'`` dummy are
      dropped, making terraced the baseline category.

    The input frame is not modified.

    Raises:
        KeyError: if ``property_type`` contains no ``'terraced'`` entry, since
            there is then no ``'terraced'`` dummy column to drop.
    """
    # Work on a copy: assigning into the caller's frame would silently turn
    # its "unencoded" has_garage column into 1/0 as a side effect.
    df = df.copy()
    df['has_garage'] = df['has_garage'].replace({'yes': 1, 'no': 0})

    # One binary indicator column per distinct property type.
    dummies = pd.get_dummies(df['property_type'])
    df = pd.concat([df, dummies], axis='columns')

    # Drop one indicator so the remaining ones are not perfectly collinear
    # with the regression intercept.
    df = df.drop(columns=['property_type', 'terraced'])

    return df

# Encode the cleaned training data and display the result.
df_encoded = encode(df_unencoded)

df_encoded


# Feature matrix (every column except the target) and the target vector.
X = df_encoded.loc[:, ['size_m2', 'bathrooms', 'meters_from_metro',
                       'has_garage', 'detached', 'semi-detached']]
y = df_encoded['price']

model = LinearRegression()

# Score the unfitted estimator with 5-fold cross-validation first ...
cv_scores = cross_val_score(model, X, y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", np.mean(cv_scores))

# ... then fit on the full data set and report the learned parameters.
model.fit(X, y)
print(f"The coefficients are: {model.coef_!s}")
print(f"The intercept/constant is: {model.intercept_!s}")

Cross-validation scores: [0.86022103 0.85712256 0.85362275 0.84165308 0.82590028]
Mean cross-validation score: 0.8477039394082991
The coefficients are: [  3470.26184982  21463.6107701    -131.99294886   5652.71566282
 109705.20322427  23904.38558323]
The intercept/constant is: 51818.0074061433


# Load the properties whose prices should be predicted.
df_new_unencoded = pd.read_csv('data_to_predict_on.csv')

# Apply the same encoding that was used for the training data.
df_new_encoded = encode(df_new_unencoded)

df_new_encoded.head()


# Pass exactly the training features, in training order. The model applies
# its coefficients by column position, so an extra or reordered column in
# the CSV would otherwise cause an error or silently wrong prices. A missing
# feature now fails here with a KeyError naming the absent column.
prediction = model.predict(df_new_encoded[X.columns])

df_new_unencoded['predicted_price'] = prediction.round(2)
df_new_unencoded.head()

	size_m2	bathrooms	meters_from_metro	has_garage	price	detached	semi-detached
0	72	3	411	1	358356	1	0
1	57	1	46	0	310489	1	0
2	67	2	1281	0	156408	0	0
3	58	1	928	1	294774	1	0
4	112	1	73	0	468561	0	1
...	...	...	...	...	...	...	...
1993	80	2	999	1	249165	0	1
1994	93	1	834	0	386007	0	0
1995	70	2	293	0	415181	1	0
1996	56	2	687	1	215375	0	1
1997	69	1	522	0	241538	0	0

This notebook contains a simple regression analysis that predicts house prices. The analysis is performed on fabricated data that I created.

Import modules and data

Examine data and drop rows with null values

Visualise data

Histograms

Scatterplots

Correlation Matrix

Encode features

Fit and evaluate model (5-fold cross-validation)

Use model to predict house prices

	size_m2	bathrooms	meters_from_metro	has_garage	property_type	predicted_price
0	100	3	207	1	semi-detached	465469.59
1	121	3	356	0	detached	598826.24
2	90	2	201	0	detached	490243.42
3	59	1	692	0	terraced	186687.95
4	61	1	482	1	terraced	226999.71